Stroke Prediction Model
Project in Progress
Preliminary code
Mass Effect Graph provided
Further Modeling and Accuracy needed
# Relevant Packages
library(pacman)
p_load(tidyverse, ggthemes, caret, mice, VIM, rpart, rpart.plot, effects, ggpubr, ROSE, pROC)
# Load Data
stroke <- read_csv("/Users/mohan/Desktop/Projects/26 Projects/Stroke Model/healthcare-dataset-stroke-data.csv") %>%
select(-c(id))
# Convert variables to appropriate data types
stroke <- stroke %>%
mutate(smoking_status = as.factor(smoking_status),
bmi = as.character(bmi),
bmi = gsub("[^0-9.]", "", bmi),
bmi = as.numeric(bmi),
work_type = as.factor(work_type),
hypertension = as.factor(hypertension),
Residence_type = as.factor(Residence_type),
stroke = as.factor(stroke),
ever_married = as.factor(ever_married),
heart_disease = as.factor(heart_disease),
gender = factor(gender, levels = c("Male", "Female")))
# Handle missing values in bmi
stroke <- stroke %>%
mutate(bmi = ifelse(is.na(bmi), mean(bmi, na.rm = TRUE), bmi))
# Data Partitioning
set.seed(123)
index <- createDataPartition(stroke$stroke, p = 0.7, list = FALSE)
train <- stroke[index, ]
test <- stroke[-index, ]
n_majority <- 1000 # Set the desired number of majority class samples
# Balancing the dataset
train_balanced <- ROSE(stroke ~ ., data = train, seed = 123, N = n_majority, p = 0.5)$data
train_balanced$stroke <- factor(train_balanced$stroke, levels = c("0", "1"))
# Model Building
dt_model <- rpart(stroke ~ ., data = train_balanced, method = "class")
# Predict on the test set
predictions <- predict(dt_model, test, type = "class")
# Convert test$stroke to factor with levels "0" and "1"
test$stroke <- factor(test$stroke, levels = c("0", "1"))
# Calculate AUC
roc <- pROC::roc(test$stroke, as.numeric(predictions))
# Imputation
imput <- mice(stroke, maxit = 0)
meth <- imput$method
predM <- imput$predictorMatrix
predM[, "bmi"] <- 0
set.seed(103)
imputed <- mice(stroke, method = meth, predictorMatrix = predM, m = 5)
stroke <- complete(imputed)
# Visualizations
# Gender
gender.dist <- tibble(gender = c("Female", "Male"),
total = as.numeric(table(stroke$gender))) %>%
mutate(gender.percent = round((total / sum(total) * 100))) %>%
ggplot(aes(x = "", y = gender.percent, fill = gender)) +
geom_bar(stat = "identity", color = "black") +
coord_polar("y", start = 0) +
theme_void() +
scale_fill_manual(values = c("#E0BBE4", "#FFDFD3", "#957DAD")) +
geom_text(aes(label = paste0(gender.percent, "%")))
gender.dist
# How many people in the dataset had a stroke
outcomes <- tibble(outcome = c("No Stroke", "Stroke"),
total = as.integer(table(stroke$stroke)))
outcome.dist <- outcomes %>%
mutate(stroke.percent = round((total / sum(total)) * 100)) %>%
mutate(ypos = cumsum(stroke.percent) - 0.5 * stroke.percent)
# Handle missing values in bmi
stroke <- stroke %>%
mutate(bmi = ifelse(is.na(bmi), mean(bmi, na.rm = TRUE), bmi))
# Data Partitioning
set.seed(123)
index <- createDataPartition(stroke$stroke, p = 0.7, list = FALSE)
train <- stroke[index, ]
test <- stroke[-index, ]
# Balancing the dataset
train_balanced <- ROSE(stroke ~ ., data = train, seed = 123, N = n_majority, p = 0.5)$data
train_balanced$stroke <- factor(train_balanced$stroke, levels = c("0", "1"))
# Model Building
dt_model <- rpart(stroke ~ ., data = train_balanced, method = "class")
# Predict on the test set
predictions <- predict(dt_model, test, type = "class")
# Convert test$stroke to factor with levels "0" and "1"
test$stroke <- factor(test$stroke, levels = c("0", "1"))
# Calculate AUC
roc <- pROC::roc(test$stroke, as.numeric(predictions))
# Imputation
imput <- mice(stroke, maxit = 0)
meth <- imput$method
predM <- imput$predictorMatrix
predM[, "bmi"] <- 0
set.seed(103)
imputed <- mice(stroke, method = meth, predictorMatrix = predM, m = 5)
stroke <- complete(imputed)
# Visualizations
# Gender
gender.dist <- tibble(gender = c("Female", "Male"),
total = as.numeric(table(stroke$gender))) %>%
mutate(gender.percent = round((total / sum(total) * 100))) %>%
ggplot(aes(x = "", y = gender.percent, fill = gender)) +
geom_bar(stat = "identity", color = "black") +
coord_polar("y", start = 0) +
theme_void() +
scale_fill_manual(values = c("#E0BBE4", "#FFDFD3", "#957DAD")) +
geom_text(aes(label = paste0(gender.percent, "%")))
gender.dist
# How many people in the dataset had a stroke
outcomes <- tibble(outcome = c("No Stroke", "Stroke"),
total = as.integer(table(stroke$stroke)))
outcome.dist <- outcomes %>%
mutate(stroke.percent = round((total / sum(total)) * 100)) %>%
mutate(ypos = cumsum(stroke.percent) - 0.5 * stroke.percent)
outcome.dist
# Feature Engineering
# Age Binning
stroke$age_group <- cut(stroke$age, breaks = c(0, 18, 40, 60, max(stroke$age)), labels = c("0-18", "19-40", "41-60", "61+"))
# BMI Category
stroke$bmi_category <- cut(stroke$bmi, breaks = c(0, 18.5, 25, 30, max(stroke$bmi)),
labels = c("Underweight", "Normal", "Overweight", "Obese"))
stroke$bmi <- as.numeric(gsub("[^0-9.]", "", trimws(stroke$bmi)))
# Data Visualization
# Age Group Distribution
age_group_dist <- stroke %>%
group_by(age_group) %>%
summarize(total = n()) %>%
mutate(age_group_percent = round((total / sum(total)) * 100)) %>%
ggplot(aes(x = age_group, y = age_group_percent, fill = age_group)) +
geom_bar(stat = "identity", color = "black") +
theme_minimal() +
xlab("Age Group") +
ylab("Percentage") +
ggtitle("Distribution by Age Group") +
geom_text(aes(label = paste0(age_group_percent, "%")), vjust = -0.5) +
theme(legend.position = "none")
age_group_dist
# BMI Category Distribution
bmi_category_dist <- stroke %>%
group_by(bmi_category) %>%
summarize(total = n()) %>%
mutate(bmi_category_percent = round((total / sum(total)) * 100)) %>%
ggplot(aes(x = bmi_category, y = bmi_category_percent, fill = bmi_category)) +
geom_bar(stat = "identity", color = "black") +
theme_minimal() +
xlab("BMI Category") +
ylab("Percentage") +
ggtitle("Distribution by BMI Category") +
geom_text(aes(label = paste0(bmi_category_percent, "%")), vjust = -0.5) +
theme(legend.position = "none")
bmi_category_dist
# Model Evaluation
# Confusion Matrix
confusion_matrix <- confusionMatrix(predictions, test$stroke)
confusion_matrix
# Accuracy
accuracy <- confusion_matrix$overall["Accuracy"]
accuracy
# AUC
auc <- roc$auc
auc
# Sensitivity and Specificity
sensitivity <- confusion_matrix$byClass["Sensitivity"]
specificity <- confusion_matrix$byClass["Specificity"]
sensitivity
specificity
library(ggplot2)
library(gridExtra)
# Demographics based on all categories
gender_plot <- ggplot(stroke, aes(x = gender)) +
geom_bar() +
labs(x = "Gender", y = "Count") +
ggtitle("Distribution of Gender")
age_plot <- ggplot(stroke, aes(x = age)) +
geom_histogram() +
labs(x = "Age", y = "Count") +
ggtitle("Distribution of Age")
ever_married_plot <- ggplot(stroke, aes(x = ever_married)) +
geom_bar() +
labs(x = "Marital Status", y = "Count") +
ggtitle("Distribution of Marital Status")
work_type_plot <- ggplot(stroke, aes(x = work_type)) +
geom_bar() +
labs(x = "Work Type", y = "Count") +
ggtitle("Distribution of Work Type")
residence_type_plot <- ggplot(stroke, aes(x = Residence_type)) +
geom_bar() +
labs(x = "Residence Type", y = "Count") +
ggtitle("Distribution of Residence Type")
# Demographics for those who had a stroke
stroke_subset <- subset(stroke, stroke == 1)
gender_stroke_plot <- ggplot(stroke_subset, aes(x = gender)) +
geom_bar() +
labs(x = "Gender", y = "Count") +
ggtitle("Distribution of Gender for Stroke Cases")
age_stroke_plot <- ggplot(stroke_subset, aes(x = age)) +
geom_histogram() +
labs(x = "Age", y = "Count") +
ggtitle("Distribution of Age for Stroke Cases")
ever_married_stroke_plot <- ggplot(stroke_subset, aes(x = ever_married)) +
geom_bar() +
labs(x = "Marital Status", y = "Count") +
ggtitle("Distribution of Marital Status for Stroke Cases")
work_type_stroke_plot <- ggplot(stroke_subset, aes(x = work_type)) +
geom_bar() +
labs(x = "Work Type", y = "Count") +
ggtitle("Distribution of Work Type for Stroke Cases")
residence_type_stroke_plot <- ggplot(stroke_subset, aes(x = Residence_type)) +
geom_bar() +
labs(x = "Residence Type", y = "Count") +
ggtitle("Distribution of Residence Type for Stroke Cases")
# Clinical metrics
hypertension_plot <- ggplot(stroke, aes(x = hypertension)) +
geom_bar() +
labs(x = "Hypertension", y = "Count") +
ggtitle("Distribution of Hypertension")
heart_disease_plot <- ggplot(stroke, aes(x = heart_disease)) +
geom_bar() +
labs(x = "Heart Disease", y = "Count") +
ggtitle("Distribution of Heart Disease")
bmi_plot <- ggplot(stroke, aes(x = bmi)) +
geom_histogram() +
labs(x = "BMI", y = "Count") +
ggtitle("Distribution of BMI")
glucose_level_plot <- ggplot(stroke, aes(x = avg_glucose_level)) +
geom_histogram() +
labs(x = "Average Glucose Level", y = "Count") +
ggtitle("Distribution of Average Glucose Level")
# First grid arrangement
grid_arrange_1 <- function(...) {
grid.arrange(..., ncol = 2)
}
# Second grid arrangement
grid_arrange_2 <- function(...) {
grid.arrange(..., ncol = 2)
}
# Arrange and display the graphs
grid_arrange_1(
gender_plot, age_plot,
ever_married_plot, work_type_plot,
residence_type_plot, gender_stroke_plot,
age_stroke_plot, ever_married_stroke_plot,
nrow = 4
)
grid_arrange_2(
work_type_stroke_plot, residence_type_stroke_plot,
hypertension_plot, heart_disease_plot,
bmi_plot, glucose_level_plot,
nrow = 4
)
library(caret)
library(ROSE)
library(rpart.plot)
set.seed(123)
# Split the data into training and testing sets
index <- createDataPartition(stroke$stroke, p = 0.8, list = FALSE)
train_data <- stroke[index, ]
test_data <- stroke[-index, ]
# Using ROSE for synthetic data generation (due to imbalanced nature of outcomes)
oversampled_data <- ovun.sample(stroke ~ ., data = train_data, method = "over", N = 2 * nrow(train_data), seed = 123)$data
# Build the binary tree model
tree_model <- rpart(stroke ~ .,
data = oversampled_data, method = "class", control = rpart.control(cp = 0.01))
# Make predictions on the test data
predictions <- predict(tree_model, newdata = test_data, type = "prob")
# Adjust classification threshold for sensitivity improvement
threshold <- 0.
predicted_class <- as.numeric(predictions[, 2] > threshold)
# Calculate misclassification rate
misclassification_rate <- mean(predictions != test_data$stroke)
# Print the binary tree
printcp(tree_model)
# Plot the binary tree
rpart.plot(tree_model)
# Evaluate variable importance
var_importance <- varImp(tree_model, scale = FALSE)
# View variable importance
print(var_importance)
# Predict on test data
test_pred <- predict(tree_model, newdata = test_data, type = "class")
# Evaluate the model
confusion_matrix <- table(test_pred, test_data$stroke)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
misclassification_rate <- 1 - accuracy
# Calculate sensitivity, specificity, and F1 score
sensitivity <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
specificity <- confusion_matrix[1, 1] / sum(confusion_matrix[1, ])
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
f1_score <- 2 * (precision * sensitivity) / (precision + sensitivity)
# View the confusion matrix, misclassification rate, sensitivity, specificity, and F1 score
print(confusion_matrix)
print(paste("Misclassification Rate:", round(misclassification_rate, 4)))
print(paste("Sensitivity:", round(sensitivity, 4)))
print(paste("Specificity:", round(specificity, 4)))
print(paste("F1 Score:", round(f1_score, 4)))
library(caret)
library(ROSE)
library(randomForest)
set.seed(123)
# Split the data into training and testing sets
index <- createDataPartition(stroke$stroke, p = 0.8, list = FALSE)
train_data <- stroke[index, ]
test_data <- stroke[-index, ]
# Using ROSE for synthetic data generation (due to imbalanced nature of outcomes)
oversampled_data <- ovun.sample(stroke ~ ., data = train_data, method = "over", N = 2 * nrow(train_data), seed = 123)$data
# Build the Random Forest model
rf_model <- randomForest(stroke ~ ., data = oversampled_data, importance = TRUE)
# Make predictions on the test data
predictions <- predict(rf_model, newdata = test_data)
# Evaluate the model
confusion_matrix <- table(predictions, test_data$stroke)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
misclassification_rate <- 1 - accuracy
sensitivity <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ])
specificity <- confusion_matrix[1, 1] / sum(confusion_matrix[1, ])
# Calculate the F1 score manually
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2])
recall <- sensitivity
f1_score <- 2 * (precision * recall) / (precision + recall)
# View the confusion matrix, performance metrics, and F1 score
print(confusion_matrix)
cat("Misclassification Rate:", misclassification_rate, "\n")
cat("Sensitivity:", sensitivity, "\n")
cat("Specificity:", specificity, "\n")
cat("F1 Score:", f1_score, "\n")
# Apply preprocessing to the training data
preprocessed_data <- preProcess(train_data, method = c("center", "scale"))
# Transform the training data using the preprocessed data
train_data_transformed <- predict(preprocessed_data, train_data)
# Train the logistic regression model
logistic_model <- glm(stroke ~ ., data = train_data_transformed, family = binomial())
# View the summary of the model
summary(logistic_model)
# Apply the same preprocessing to the test data
test_data_transformed <- predict(preprocessed_data, test_data)
# Make predictions on the test data
pred <- predict(logistic_model, newdata = test_data_transformed, type = "response")
# Convert predicted probabilities to class labels
pred_class <- ifelse(pred > 0.5, 1, 0)
# Evaluate the model performance using accuracy
accuracy <- mean(pred_class == test_data_transformed$stroke)
print(accuracy)
# Calculate true positives, true negatives, false positives, and false negatives
tp <- sum(pred_class == 1 & test_data_transformed$stroke == 1)
tn <- sum(pred_class == 0 & test_data_transformed$stroke == 0)
fp <- sum(pred_class == 1 & test_data_transformed$stroke == 0)
fn <- sum(pred_class == 0 & test_data_transformed$stroke == 1)
# Calculate sensitivity (true positive rate)
sensitivity <- tp / (tp + fn)
# Calculate specificity (true negative rate)
specificity <- tn / (tn + fp)
# Calculate precision, recall, and F1 score
precision <- tp / (tp + fp)
precision[is.nan(precision)] <- 0 # Set precision to 0 if NaN
recall <- sensitivity
recall[is.nan(recall)] <- 0 # Set recall to 0 if NaN
f1_score <- ifelse((precision + recall) > 0, 2 * (precision * recall) / (precision + recall), 0)
# Calculate misclassification rate
misclassification_rate <- mean(pred_class != test_data_transformed$stroke)
# Print the results
cat("Misclassification Rate:", misclassification_rate, "\n")
cat("Sensitivity:", sensitivity, "\n")
cat("Specificity:", specificity, "\n")
cat("F1 Score:", f1_score, "\n")
# Fit the logistic regression model
model <- glm(stroke ~ age + hypertension + smoking_status + avg_glucose_level + bmi,
data = train_data_transformed, family = binomial())
# Get the coefficients
coefficients <- coef(model)
predictors <- names(coefficients)[-1] # Exclude the intercept
# Compute the odds ratios
odds_ratios <- exp(coefficients[-1])
names(odds_ratios) <- predictors
# Create a bar plot of the odds ratios
barplot(odds_ratios, main = "Effects of Factors on Stroke Risk",
xlab = "Predictors", ylab = "Odds Ratio", col = "blue", ylim = c(0, max(odds_ratios) * 1.2))
# Add error bars indicating the 95% confidence intervals
errors <- coef(summary(model))[-1, "Std. Error"]
arrows(x0 = 1:length(predictors), y0 = odds_ratios - 1.96 * errors,
x1 = 1:length(predictors), y1 = odds_ratios + 1.96 * errors,
angle = 90, code = 3, length = 0.05)